from sklearn import metrics
from sklearn.metrics import confusion_matrix
# FIX: `sklearn.cross_validation` was deprecated in 0.18 and removed in 0.20.
# Alias `model_selection` under the old name so every later
# `cross_validation.cross_val_score(...)` call in this script keeps working
# unchanged (the function signature is compatible).
from sklearn import model_selection as cross_validation
import matplotlib.pylab as plt
import pandas as pd
import numpy as np

# Load the churn data set (semicolon-separated, Latin-1 encoded export).
churn= pd.read_csv("C:/Users/nawres.jguirim.stg/presentationpython/dataSet/final.csv", sep=';', encoding = "ISO-8859-1")

# Target: inactivity flag (presumably 1 = inactive/churned -- TODO confirm).
y = churn['Flag_inactivite']

# Categorical predictors: every object column except raw date/period fields.
categoric=churn.select_dtypes(include=['object'])
drop=[ 'subscriber_activation_date', 'period_end','period_start']
categoric = categoric.drop(drop,axis=1)

# Numeric predictors: every int64 column except the id and the target itself.
num=churn.select_dtypes(include=['int64'])
drop2=['subscriber_id', 'Flag_inactivite']
num=num.drop(drop2,axis=1)

# One-hot encode the categoricals and assemble the full design matrix.
col_categoric_dummies = pd.get_dummies( categoric )
X = pd.concat([num, col_categoric_dummies],axis=1)
from sklearn.model_selection import train_test_split
############################################################
#################### random forest ####################
############################################################
Rank= [True,True,True,True,False,False,False,False,True,True,True,True
,True,False,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,False
,True,True,True,True,False,True,True,True,True,True,True,False
,False,True,True,True,True,True,True,True,True,True,True,False
,False,True,True,True,False,True,True,False,False,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,False,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,True,False,False,False,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,False,False,False,False,False,False,False,False,False]
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X[li.Features], y, test_size =0.3)
import time
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier( n_estimators=300, max_depth=30,
min_samples_split=50,min_samples_leaf=25)
#n_estimators=the number of trees in the forest
random_forest.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_random_forest = random_forest.predict(X_test)
model_score_random_forest = random_forest.score(X_test, y_test)
preds_random_forest = random_forest.predict_proba(X_test)[:,1]
fp_random_forest, tp_random_forest, threshold_random_forest = metrics.roc_curve(y_test, preds_random_forest)
AUC_random_forest = metrics.auc(fp_random_forest, tp_random_forest)
#K-folds cross validaion
cv_score_random_forest = cross_validation.cross_val_score(random_forest, X_train , y_train, cv=3, scoring='roc_auc')
cv_mean_random_forest=np.mean(cv_score_random_forest)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_random_forest )
print ("AUC Score (Test): %0.3f" % AUC_random_forest )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_random_forest),
np.std(cv_score_random_forest),
np.min(cv_score_random_forest),
np.max(cv_score_random_forest)))
import math
y_hat_random_forest= random_forest.predict(X_test)
RSS_random_forest = ((y_hat_random_forest - y_test) ** 2).sum()
k=X.shape[1]
AIC_random_forest= 2*k - 2*math.log(RSS_random_forest )
print( "Akaike information criterion: " + str(AIC_random_forest ))
print('somme erreur carré ' +str(RSS_random_forest))
confusion_matrix_random_forest = metrics.confusion_matrix(y_test, predict_y_random_forest)
TN= confusion_matrix_random_forest[0, 0]
FP=confusion_matrix_random_forest[0, 1]
FN=confusion_matrix_random_forest[1, 0]
TP=confusion_matrix_random_forest[1, 1]
#false negative rate error type I
fnr_random_forest=FN/(FN+ TP)
#False negaive rate error type II
fpr_random_forest=FP/(FP+TN)
print ("Erreur type I: FN false negative rate: %.3g" % fnr_random_forest )
print ("Erreur type II: FP False Positive rate: %.3g" % fpr_random_forest )
print ("\n ")
print ("Confusion Matrix: ")
print (" Predicted")
print (" | 0 | 1 |")
print (" |-----|-----|")
print (" 0 | %3d | %3d |" % (confusion_matrix_random_forest[0, 0],
confusion_matrix_random_forest[0, 1]))
print ("Actual |-----|-----|")
print (" 1 | %3d | %3d |" % (confusion_matrix_random_forest[1, 0],
confusion_matrix_random_forest[1, 1]))
print (" |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test,
predict_y_random_forest,
digits=3))
fig = plt.figure(figsize=(10,4), dpi=1600)
feat_imp_random_forest = pd.Series(random_forest.feature_importances_,li.Features)[1:70].sort_values(ascending=False)
feat_imp_random_forest.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_random_forest, tp_random_forest, 'b', label = 'AUC = %0.3f' % AUC_random_forest)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
################## MLP ##################
###########################################
# Multi-layer perceptron trained on a 37-component PCA projection of the full
# (un-pruned) design matrix X.
import time
start_time = time.time()
from sklearn.decomposition import PCA
pca= PCA(n_components=37, whiten= True )
pca.fit(X)
X_pca = pca.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size =0.3 )
from sklearn.neural_network import MLPClassifier
# One hidden layer of 30 units.  (30,) is the explicit one-layer tuple form;
# the original wrote (30), a plain int, which older sklearn silently wrapped.
mlp = MLPClassifier(hidden_layer_sizes=(30,),max_iter=500)
mlp.fit(X_train,y_train)
# FIX: the original then constructed a second, fully-parameterised
# MLPClassifier(...) and immediately discarded the result -- a pasted repr,
# not a call that did anything.  That dead statement was removed.
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_mlp = mlp.predict(X_test)
model_score_mlp = mlp.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_mlp = mlp.predict_proba(X_test)
preds_mlp = probs_mlp[:,1]
fp_mlp, tp_mlp, threshold_mlp = metrics.roc_curve(y_test, preds_mlp)
AUC_mlp = metrics.auc(fp_mlp, tp_mlp)
#K-folds cross validaion
cv_score_mlp = cross_validation.cross_val_score(mlp, X_train , y_train, cv=5, scoring='roc_auc')
cv_mean_mlp=np.mean(cv_score_mlp)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_mlp )
print ("AUC Score (Test): %0.3f" % AUC_mlp )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_mlp),
np.std(cv_score_mlp),
np.min(cv_score_mlp),
np.max(cv_score_mlp)))
# Confusion matrix layout: rows = actual (0,1), columns = predicted (0,1).
confusion_matrix_mlp = metrics.confusion_matrix(y_test, predict_y_mlp )
TN= confusion_matrix_mlp[0, 0]
FP=confusion_matrix_mlp[0, 1]
FN=confusion_matrix_mlp[1, 0]
TP=confusion_matrix_mlp[1, 1]
# Miss rate (false-negative rate) and fall-out (false-positive rate).
fnr_mlp=FN/(FN+ TP)
fpr_mlp=FP/(FP+TN)
# FIX: the original printed fnr labelled "False Positive rate" and fpr
# labelled "false negative rate".  Convention: type I = FP, type II = FN.
print ("Erreur type I: FP False Positive rate: %.3g" % fpr_mlp )
print ("Erreur type II: FN False Negative rate: %.3g" % fnr_mlp )
print ("\n ")
print ("Confusion Matrix: ")
print ("          Predicted")
print ("         |  0   |  1   |")
print ("         |------|------|")
print ("       0 | %3d  | %3d  |" % (confusion_matrix_mlp[0, 0],
                                     confusion_matrix_mlp[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d  | %3d  |" % (confusion_matrix_mlp[1, 0],
                                     confusion_matrix_mlp[1, 1]))
print ("         |------|------|")
print ("\n ")
import math
y_hat_mlp= mlp.predict(X_test)
# Residual sum of squares on the 0/1 labels (== misclassification count).
RSS_mlp = ((y_hat_mlp - y_test) ** 2).sum()
k=X.shape[1]
# NOTE(review): "2k - 2*log(RSS)" is not the usual AIC formula -- kept for
# consistency with the other sections; confirm the intended definition.
AIC_mlp= 2*k - 2*math.log(RSS_mlp)
print( "Akaike information criterion: " + str(AIC_mlp))
print ("\n ")
from sklearn.metrics import classification_report
# Test-set report, then train-set report to eyeball over-fitting.
print(classification_report(y_test,
                            predict_y_mlp,
                            digits=3))
predictionTrain = mlp.predict(X_train)
print("train !" + str(classification_report(y_train,predictionTrain , digits=3)))
# FIX: the original plotted gbm.feature_importances_ here, but neither `gbm`
# nor `selected_features` exists at this point (NameError) and an MLP has no
# feature importances -- the pasted plotting block was removed.
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_mlp, tp_mlp, 'b', label = 'AUC = %0.3f' % AUC_mlp)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
####################### SVM #####################
#######################################################
##################### regression #####################
######################################################
# NOTE(review): no SVM model is actually trained anywhere below -- the banner
# appears to be a leftover; only logistic regression follows.
import time
# Timer start for the logistic-regression section; the elapsed time is
# printed right after that model's fit.
start_time = time.time()
def correlation(dataset, threshold):
    """Drop, *in place*, every column of ``dataset`` whose absolute pairwise
    correlation with an earlier column is >= ``threshold``.

    The correlation matrix is computed once up front, so deletions never
    trigger a recomputation.  The mutated ``dataset`` is also returned.
    """
    dropped = set()  # names of the columns removed so far
    corr_matrix = dataset.corr()
    for i, name in enumerate(corr_matrix.columns):
        # Only inspect the lower triangle: pairs (i, j) with j < i, so the
        # later column of each highly-correlated pair is the one removed.
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) < threshold:
                continue
            dropped.add(name)
            if name in dataset.columns:
                del dataset[name]
    return dataset
X_corr= pd.DataFrame(correlation( X, .7))
Rank= [False,False,False,False,False,False,False,False,False,False,True,False
,False,False,False,False,False,False,False,False,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,False,False,False,False,False,False,False,True,False,True,False,True
,True,False,True,True,True,False,True,False,False,False,True,True
,True,False,True,False,True,True,True,True,True,True]
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X_corr[li.Features], y, test_size =0.3)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_logreg = logreg.predict(X_test)
model_score_logreg = logreg.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_logreg = logreg.predict_proba(X_test)
preds_logreg = probs_logreg[:,1]
fp_logreg, tp_logreg, threshold_logreg = metrics.roc_curve(y_test, preds_logreg)
AUC_logreg = metrics.auc(fp_logreg, tp_logreg)
#K-folds cross validaion
cv_score_logreg = cross_validation.cross_val_score( logreg, X_train , y_train, cv=3, scoring='roc_auc')
cv_mean_logreg=np.mean(cv_score_logreg)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_logreg )
print ("AUC Score (Test): %0.3f" % AUC_logreg )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_logreg),
np.std(cv_score_logreg),
np.min(cv_score_logreg),
np.max(cv_score_logreg)))
import math
y_hat_logreg= logreg.predict(X_test)
RSS_logreg = ((y_hat_logreg - y_test) ** 2).sum()
k=X.shape[1]
AIC_logreg= 2*k - 2*math.log(RSS_logreg )
print( "Akaike information criterion: " + str(AIC_logreg ))
print('somme erreur carré ' +str(RSS_logreg))
confusion_matrix_logreg = metrics.confusion_matrix(y_test, predict_y_logreg)
TN= confusion_matrix_logreg[0, 0]
FP=confusion_matrix_logreg[0, 1]
FN=confusion_matrix_logreg[1, 0]
TP=confusion_matrix_logreg[1, 1]
#false negative rate error type I
fnr_logreg=FN/(FN+ TP)
#False negaive rate error type II
fpr_logreg=FP/(FP+TN)
print ("Erreur type I: FP False Positive rate: %.3g" % fnr_logreg )
print ("Erreur type II: FP false negative rate: %.3g" % fpr_logreg )
print ("\n ")
print ("Confusion Matrix: ")
print (" Predicted")
print (" | 0 | 1 |")
print (" |------|------|")
print (" 0 | %3d | %3d |" % (confusion_matrix_logreg[0, 0],
confusion_matrix_logreg[0, 1]))
print ("Actual |------|------|")
print (" 1 | %3d | %3d |" % (confusion_matrix_logreg[1, 0],
confusion_matrix_logreg[1, 1]))
print (" |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test,
predict_y_logreg,
digits=3))
print("train ")
print(classification_report(y_train ,
logreg.predict(X_train),
digits=3))
#ROC Curve
plt.figure()
plt.plot(fp_logreg, tp_logreg, label='Logistic Regression (area = %0.3f)' % AUC_logreg)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
################################Gradient Boosting ################################
###################################################################################
import time
start_time = time.time()
##### for reexecute the code ######
Rank= [True,True,False,True,True,True,False,False,True,True,True,True
,True,False,True,True,False,False,True,True,True,True,True,False
,True,False,True,True,True,True,True,True,True,True,True,True
,True,True,False,False,True,False,True,True,True,True,True,False
,False,False,False,True,True,True,True,True,True,True,True,False
,False,True,False,True,False,True,True,True,True,True,True,False
,True,False,True,True,True,True,True,False,False,False,False,False
,False,False,False,False,False,True,True,True,False,False,False,True
,True,True,False,False,False,False,False,False,True,False,False,False
,False,True,False,True,False,True,False,False,False,False,False,False
,True,False,False,False,False,False,False,False,False,True,False,False
,False,False,False,False,False,False,False,False,False]
X = pd.concat([num, col_categoric_dummies],axis=1)
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
selected_features=li.Features
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size =0.3 , stratify=y)
import time
start_time = time.time()
from sklearn.ensemble import GradientBoostingClassifier #GBM algorithm
gbm = GradientBoostingClassifier( n_estimators=600)
gbm.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_gbm = gbm.predict(X_test)
model_score_gbm = gbm.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_gbm = gbm.predict_proba(X_test)
preds_gbm = probs_gbm[:,1]
fp_gbm, tp_gbm, threshold_gbm = metrics.roc_curve(y_test, preds_gbm)
AUC_gbm = metrics.auc(fp_gbm, tp_gbm)
#K-folds cross validaion
cv_score_gbm = cross_validation.cross_val_score(gbm, X_train , y_train, cv=3, scoring='roc_auc')
cv_mean_gbm=np.mean(cv_score_gbm)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_gbm )
print ("AUC Score (Test): %0.3f" % AUC_gbm )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_gbm),
np.std(cv_score_gbm),
np.min(cv_score_gbm),
np.max(cv_score_gbm)))
confusion_matrix_gbm = metrics.confusion_matrix(y_test, predict_y_gbm)
import math
RSS_gbm = ((predict_y_gbm - y_test) ** 2).sum()
k=X.shape[1]
AIC_gbm= 2*k - 2*math.log(RSS_gbm )
print( "Akaike information criterion: " + str(AIC_gbm ))
print('somme erreur carré ' +str(RSS_gbm))
TN= confusion_matrix_gbm[0, 0]
FP=confusion_matrix_gbm[0, 1]
FN=confusion_matrix_gbm[1, 0]
TP=confusion_matrix_gbm[1, 1]
#false negative rate error type I
fnr_gbm=FN/(FN+ TP)
#False negaive rate error type II
fpr_gbm=FP/(FP+TN)
print ("Erreur type I: FP False Positive rate: %.3g" % fnr_gbm )
print ("Erreur type II: FP false negative rate: %.3g" % fpr_gbm )
print ("\n ")
print ("Confusion Matrix: ")
print (" Predicted")
print (" | 0 | 1 |")
print (" |-----|-----|")
print (" 0 | %3d | %3d |" % (confusion_matrix_gbm[0, 0],
confusion_matrix_gbm[0, 1]))
print ("Actual |-----|-----|")
print (" 1 | %3d | %3d |" % (confusion_matrix_gbm[1, 0],
confusion_matrix_gbm[1, 1]))
print (" |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test,
predict_y_gbm,
digits=3))
print("Train !")
print(classification_report(y_train,
gbm.predict(X_train),
digits=3))
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_gbm, tp_gbm, 'b', label = 'AUC = %0.3f' % AUC_gbm)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
fig = plt.figure(figsize=(10,4), dpi=1600)
feat_imp_gbm= pd.Series(gbm.feature_importances_,selected_features )[1:55].sort_values(ascending=False)
feat_imp_gbm.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
################ Bays ################
######################################
# Bernoulli naive Bayes on the correlation-pruned feature set X_corr.
start_time = time.time()
X_train, X_test, y_train, y_test = train_test_split(X_corr, y, test_size=0.3)
from sklearn.naive_bayes import BernoulliNB
# Create a Bernoulli naive-Bayes classifier (the original comment said
# "Gaussian", but BernoulliNB is what is used).
model = BernoulliNB()
# Train the model using the training sets
bays= model.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_bays = bays.predict(X_test)
model_score_bays = bays.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_bays = bays.predict_proba(X_test)
preds_bays = probs_bays[:,1]
fp_bays, tp_bays, threshold_bays = metrics.roc_curve(y_test, preds_bays)
AUC_bays = metrics.auc(fp_bays, tp_bays)
#K-folds cross validaion
cv_score_bays = cross_validation.cross_val_score(bays, X_train , y_train, cv=3, scoring='roc_auc')
# FIX: the original reset bays/AUC_bays/cv_score_bays/etc. to None (and called
# gc.collect() without ever importing gc) *before* the report below used those
# names, which crashed this whole section.  That clean-up block was removed.
cv_mean_bays =np.mean(cv_score_bays)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_bays )
print ("AUC Score (Test): %0.3f" % AUC_bays )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_bays),
np.std(cv_score_bays),
np.min(cv_score_bays),
np.max(cv_score_bays)))
# Confusion matrix layout: rows = actual (0,1), columns = predicted (0,1).
confusion_matrix_bays = metrics.confusion_matrix(y_test, predict_y_bays)
import math
y_hat_bays= bays.predict(X_test)
# Residual sum of squares on the 0/1 labels (== misclassification count).
RSS_bays = ((y_hat_bays - y_test) ** 2).sum()
k=X.shape[1]
# NOTE(review): "2k - 2*log(RSS)" is not the standard AIC formula -- kept for
# consistency with the other sections; confirm the intended definition.
AIC_bays= 2*k - 2*math.log(RSS_bays )
print( "Akaike information criterion: " + str(AIC_bays ))
print('somme erreur carré ' +str(RSS_bays))
TN= confusion_matrix_bays[0, 0]
FP=confusion_matrix_bays[0, 1]
FN=confusion_matrix_bays[1, 0]
TP=confusion_matrix_bays[1, 1]
# Miss rate (false-negative rate) and fall-out (false-positive rate).
fnr_bays=FN/(FN+ TP)
fpr_bays=FP/(FP+TN)
# FIX: the original printed fnr labelled "False Positive rate" and fpr
# labelled "false negative rate".  Convention: type I = FP, type II = FN.
print ("Erreur type I: FP False Positive rate: %.3g" % fpr_bays )
print ("Erreur type II: FN False Negative rate: %.3g" % fnr_bays )
print ("\n ")
print ("Confusion Matrix: ")
print ("          Predicted")
print ("         |  0  |  1  |")
print ("         |-----|-----|")
print ("       0 | %3d | %3d |" % (confusion_matrix_bays[0, 0],
                                   confusion_matrix_bays[0, 1]))
print ("Actual   |-----|-----|")
print ("       1 | %3d | %3d |" % (confusion_matrix_bays[1, 0],
                                   confusion_matrix_bays[1, 1]))
print ("         |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test,
                            predict_y_bays))
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_bays, tp_bays, 'b', label = 'AUC = %0.3f' % AUC_bays)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
################ Discriminante analysis ################
###################################################
################ arbre de decision ################
###################################################
# Decision tree on a hard-coded feature mask over the rebuilt full X.
Rank= [False,False,False,False,False,False,False,False,False,False,False,False
,False,False,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,False,False,False,False,False,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True]
# Rebuild the full design matrix (X may have been mutated by earlier sections).
X = pd.concat([num, col_categoric_dummies],axis=1)
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X[li.Features], y, test_size =0.3)
import time
start_time = time.time()
from sklearn import tree
# Shallow, regularised tree to limit over-fitting.
DT= tree.DecisionTreeClassifier( max_depth=7, min_samples_split=150,min_samples_leaf= 100)
DT.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# FIX: the original had two garbled lines here with several statements pasted
# onto one line ("import time start_time = time.time()" ...) -- a SyntaxError
# that stopped the whole file from parsing, and they referenced an undefined
# `col_numeric`.  They duplicated the training above and were removed.
# Predict and get the accuracy score
from sklearn import metrics
predict_y_DT = DT.predict(X_test)
model_score_DT = DT.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_DT = DT.predict_proba(X_test)
preds_DT = probs_DT[:,1]
fp_DT, tp_DT, threshold_DT = metrics.roc_curve(y_test, preds_DT)
AUC_DT = metrics.auc(fp_DT, tp_DT)
#K-folds cross validaion
cv_score_DT = cross_validation.cross_val_score(DT, X_train , y_train, cv=3, scoring='roc_auc')
cv_mean_DT =np.mean(cv_score_DT)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_DT )
print ("AUC Score (Test): %0.3f" % AUC_DT )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_DT),
np.std(cv_score_DT),
np.min(cv_score_DT),
np.max(cv_score_DT)))
# Confusion matrix layout: rows = actual (0,1), columns = predicted (0,1).
confusion_matrix_DT = metrics.confusion_matrix(y_test, predict_y_DT)
import math
y_hat_DT= DT.predict(X_test)
# Residual sum of squares on the 0/1 labels (== misclassification count).
RSS_DT = ((y_hat_DT - y_test) ** 2).sum()
k=X.shape[1]
# NOTE(review): "2k - 2*log(RSS)" is not the standard AIC formula -- kept for
# consistency with the other sections; confirm the intended definition.
AIC_DT= 2*k - 2*math.log(RSS_DT )
print( "Akaike information criterion: " + str(AIC_DT ))
print('somme erreur carré ' +str(RSS_DT))
TN= confusion_matrix_DT[0, 0]
FP=confusion_matrix_DT[0, 1]
FN=confusion_matrix_DT[1, 0]
TP=confusion_matrix_DT[1, 1]
# Miss rate (false-negative rate) and fall-out (false-positive rate).
fnr_DT=FN/(FN+ TP)
fpr_DT=FP/(FP+TN)
# FIX: the original printed fnr labelled "False Positive rate" and fpr
# labelled "false negative rate".  Convention: type I = FP, type II = FN.
print ("Erreur type I: FP False Positive rate: %.3g" % fpr_DT )
print ("Erreur type II: FN False Negative rate: %.3g" % fnr_DT )
print ("\n ")
print ("Confusion Matrix: ")
print ("          Predicted")
print ("         |  0   |  1   |")
print ("         |------|------|")
print ("       0 | %3d  | %3d  |" % (confusion_matrix_DT[0, 0],
                                     confusion_matrix_DT[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d  | %3d  |" % (confusion_matrix_DT[1, 0],
                                     confusion_matrix_DT[1, 1]))
print ("         |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test,
                            predict_y_DT,
                            digits=3))
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_DT, tp_DT, 'b', label = 'AUC = %0.3f' % AUC_DT)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
################################ adaptative boossting ################################
######################################################################################
Rank= [True,True,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,False,True,True,False,False,True,True,True,False,False,True
,False,True,True,True,False,False,False,True,True,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,False,False,False,False,False,False,True,False,False,False,False,False
,False,False,False,True,True,True,True,True,True,True,True,True
,True,True,True,True,False,False,True,False,False,False,True,True
,False,False,True,False,False,False,True,False,True,False,False,False
,False,True,True,True,False,True,False,False,False,False,False,False
,True,False,True,True,True,True,True,True,True,True,False,False
,False,False,False,False,False,False,False,False,False]
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X[li.Features], y, test_size =0.3)
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier( n_estimators=400,algorithm='SAMME.R', learning_rate=.6)
ada.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_ada = ada.predict(X_test)
model_score_ada = ada.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_ada = ada.predict_proba(X_test)
preds_ada = probs_ada[:,1]
fp_ada, tp_ada, threshold_ada = metrics.roc_curve(y_test, preds_ada)
AUC_ada = metrics.auc(fp_ada, tp_ada)
#K-folds cross validaion
cv_score_ada = cross_validation.cross_val_score(ada, X_train , y_train, cv=3, scoring='roc_auc')
cv_mean_ada =np.mean(cv_score_ada)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_ada )
print ("AUC Score (Test): %0.3f" % AUC_ada )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_ada),
np.std(cv_score_ada),
np.min(cv_score_ada),
np.max(cv_score_ada)))
import math
RSS_ada = (( predict_y_ada - y_test) ** 2).sum()
k=X.shape[1]
AIC_ada= 2*k - 2*math.log(RSS_ada )
print( "Akaike information criterion: " + str(AIC_ada ))
print('somme erreur carré ' +str(RSS_ada))
confusion_matrix_ada = metrics.confusion_matrix(y_test, predict_y_ada)
TN= confusion_matrix_ada[0, 0]
FP=confusion_matrix_ada[0, 1]
FN=confusion_matrix_ada[1, 0]
TP=confusion_matrix_ada[1, 1]
#false negative rate error type I
fnr_ada=FN/(FN+ TP)
#False negaive rate error type II
fpr_ada=FP/(FP+TN)
print ("Erreur type I: FP False Positive rate: %.3g" % fnr_ada )
print ("Erreur type II: FP false negative rate: %.3g" % fpr_ada )
print ("\n ")
print ("Confusion Matrix: ")
print (" Predicted")
print (" | 0 | 1 |")
print (" |------|------|")
print (" 0 | %3d | %3d |" % (confusion_matrix_ada[0, 0],
confusion_matrix_ada[0, 1]))
print ("Actual |------|------|")
print (" 1 | %3d | %3d |" % (confusion_matrix_ada[1, 0],
confusion_matrix_ada[1, 1]))
print (" |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test,
predict_y_ada,
digits=3))
print('train ')
print(classification_report(y_train,
ada.predict(X_train),
digits=3))
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_ada, tp_ada, 'b', label = 'AUC = %0.3f' % AUC_ada)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
################################ ExtraTrees ################################
######################################################################################
Rank= [True,True,True,True,False,False,False,False,True,True,True,False
,True,True,True,True,False,False,False,True,True,True,True,True
,True,True,True,True,False,False,True,True,True,True,True,True
,True,True,True,False,False,True,True,True,True,False,False,True
,True,True,True,False,False,False,True,True,True,False,False,True
,True,True,True,False,False,False,True,True,True,False,False,True
,True,True,True,False,False,False,True,False,False,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,True,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,True,False,False,False,False,False,False
,True,False,True,False,False,False,True,True,False,True,True,True
,True,True,True,True,True,True,True,False,True]
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3)
import time
start_time = time.time()
from sklearn.ensemble import ExtraTreesClassifier
extra= ExtraTreesClassifier( n_estimators=500, max_depth=30, min_samples_split=50, min_samples_leaf=25)
extra.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score
from sklearn import metrics
predict_y_extra = extra.predict(X_test)
model_score_extra = extra.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_extra = extra.predict_proba(X_test)
preds_extra = probs_extra[:,1]
fp_extra, tp_extra, threshold_extra = metrics.roc_curve(y_test, preds_extra)
AUC_extra = metrics.auc(fp_extra, tp_extra)
#K-folds cross validaion
cv_score_extra = cross_validation.cross_val_score(extra, X_train , y_train, cv=3, scoring='roc_auc')
cv_mean_extra=np.mean(cv_score_extra)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_extra )
print ("AUC Score (Test): %0.3f" % AUC_extra )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_extra),
np.std(cv_score_extra),
np.min(cv_score_extra),
np.max(cv_score_extra)))
import math
y_hat= extra.predict(X_test)
RSS_extra = ((y_hat - y_test) ** 2).sum()
k=X.shape[1]
AIC_extra= 2*k - 2*math.log(RSS_extra )
print( "Akaike information criterion: " + str(AIC_extra ))
print('somme erreur carré ' +str(RSS_extra))
confusion_matrix_extra = metrics.confusion_matrix(y_test, predict_y_extra)
TN= confusion_matrix_extra[0, 0]
FP=confusion_matrix_extra[0, 1]
FN=confusion_matrix_extra[1, 0]
TP=confusion_matrix_extra[1, 1]
#false negative rate error type I
fnr_extra=FN/(FN+ TP)
#False negaive rate error type II
fpr_extra=FP/(FP+TN)
print ("Erreur type I: FP False Positive rate: %.3g" % fnr_extra )
print ("Erreur type II: FP false negative rate: %.3g" % fpr_extra )
print ("\n ")
print ("Confusion Matrix: ")
print (" Predicted")
print (" | 0 | 1 |")
print (" |------|------|")
print (" 0 | %3d | %3d |" % (confusion_matrix_extra[0, 0],
confusion_matrix_extra[0, 1]))
print ("Actual |------|------|")
print (" 1 | %3d | %3d |" % (confusion_matrix_extra[1, 0],
confusion_matrix_extra[1, 1]))
print (" |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
# Per-class precision/recall/F1: first on the held-out test split, then on
# the training split (a large gap between the two signals overfitting).
print(classification_report(y_test,
                            predict_y_extra,
                            digits=3))
print('train ')
print(classification_report(y_train,
                            extra.predict(X_train),
                            digits=3))
# method I: plt — ROC curve of the Extra Trees model alone.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_extra, tp_extra, 'b', label='AUC = %0.3f' % AUC_extra)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
# Pin both axes to the unit square and label them.
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
####################### bagging clasifier #######################
# Rebuild the full design matrix (numeric columns + one-hot categoricals).
X = pd.concat([num, col_categoric_dummies],axis=1)
# Boolean mask from an earlier feature-selection run: True = keep the
# column at the same position in X.columns.
Rank= [True,True,True,True,False,False,False,False,True,True,True,True
,True,False,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,False
,True,True,True,True,False,True,True,True,True,True,True,False
,False,True,True,True,True,True,True,True,True,True,True,False
,False,True,True,True,False,True,True,False,False,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,False,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,True,False,False,False,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,False,False,False,False,False,False,False,False,False]
data={ 'Features': X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]   # rows whose Rank flag is True
selected_features=li.Features
from sklearn.model_selection import train_test_split
# stratify=y keeps the class ratio identical in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size =0.3 , stratify=y)
len(selected_features)  # no-op outside a notebook; kept for interactive display
import time
start_time = time.time()
# FIX: BaggingClassifier was imported twice in a row; deduplicated.
from sklearn.ensemble import BaggingClassifier
# bootstrap=False with max_samples=0.2 -> each estimator trains on a random
# 20% subsample drawn WITHOUT replacement ("pasting").
bagging = BaggingClassifier( n_estimators=400,max_samples= 0.2,bootstrap=False)
bagging.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
# Predict and get the accuracy score on the held-out test split.
from sklearn import metrics
predict_y_bagging = bagging.predict(X_test)
model_score_bagging = bagging.score(X_test, y_test)
# calculate the fpr and tpr for all thresholds of the classification
probs_bagging = bagging.predict_proba(X_test)
preds_bagging = probs_bagging[:,1]
fp_bagging, tp_bagging, threshold_bagging = metrics.roc_curve(y_test, preds_bagging)
AUC_bagging = metrics.auc(fp_bagging, tp_bagging)
# K-folds cross validation on the TRAINING split only.
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
cv_score_bagging = cross_val_score(bagging, X_train, y_train, cv=3, scoring='roc_auc')
cv_mean_bagging=np.mean(cv_score_bagging)
# Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_bagging )
print ("AUC Score (Test): %0.3f" % AUC_bagging )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_bagging),
np.std(cv_score_bagging),
np.min(cv_score_bagging),
np.max(cv_score_bagging)))
confusion_matrix_bagging = metrics.confusion_matrix(y_test, predict_y_bagging)
import math
# RSS of 0/1 predictions vs 0/1 labels == number of misclassified rows.
RSS_bagging = ((predict_y_bagging - y_test) ** 2).sum()
k=X.shape[1]
# NOTE(review): non-standard AIC (usual least-squares form is n*ln(RSS/n) + 2k);
# kept as-is so it remains comparable with the other models in this script.
AIC_bagging= 2*k - 2*math.log(RSS_bagging )
print( "Akaike information criterion: " + str(AIC_bagging ))
print('somme erreur carré ' +str(RSS_bagging))
# sklearn convention: rows = actual, columns = predicted, label order [0, 1].
TN= confusion_matrix_bagging[0, 0]
FP=confusion_matrix_bagging[0, 1]
FN=confusion_matrix_bagging[1, 0]
TP=confusion_matrix_bagging[1, 1]
# false negative rate (miss rate) -- conventionally the type II error
fnr_bagging=FN/(FN+ TP)
# false positive rate (fall-out) -- conventionally the type I error
fpr_bagging=FP/(FP+TN)
# FIX: the original printed fnr under a "False Positive rate" label (and
# vice versa); each rate is now printed under its correct name/error type.
# Variable names and values are unchanged for downstream consumers.
print ("Erreur type I: FP false positive rate: %.3g" % fpr_bagging )
print ("Erreur type II: FN false negative rate: %.3g" % fnr_bagging )
print ("\n ")
print ("Confusion Matrix: ")
print ("              Predicted")
print ("             |  0  |  1  |")
print ("             |-----|-----|")
print ("      0      | %3d | %3d |" % (confusion_matrix_bagging[0, 0],
confusion_matrix_bagging[0, 1]))
print ("Actual |-----|-----|")
print ("      1      | %3d | %3d |" % (confusion_matrix_bagging[1, 0],
confusion_matrix_bagging[1, 1]))
print ("             |-----|-----|")
print ("\n ")
# Per-class report on the test split, then on the training split
# (a large train/test gap would indicate overfitting).
from sklearn.metrics import classification_report
print(classification_report(y_test,
predict_y_bagging,
digits=3))
print("Train !")
print(classification_report(y_train,
bagging.predict(X_train),
digits=3))
# Summary table: one row per model trained earlier in this script.
table=pd.DataFrame(columns=['Model','AUC','Accuracy' ,'AIC','somme erreur carré','CV AUC mean','Error I','Error II'])
# FIX: the original assigned both 'Random Forest' and 'Bagging Tree' to
# table.loc[0], silently overwriting the Random Forest row. Every model now
# gets its own unique index (9 rows, matching the bar chart's 9 labels).
table.loc[0]=['Random Forest',AUC_random_forest, model_score_random_forest, AIC_random_forest, RSS_random_forest, cv_mean_random_forest ,fnr_random_forest, fpr_random_forest]
table.loc[1]=['Bagging Tree',AUC_bagging, model_score_bagging, AIC_bagging, RSS_bagging, cv_mean_bagging ,fnr_bagging, fpr_bagging]
table.loc[2]=['Multilayer Perceptron',AUC_mlp, model_score_mlp, AIC_mlp,RSS_mlp, cv_mean_mlp , fnr_mlp, fpr_mlp]
table.loc[3]=['logistic Regression', AUC_logreg , model_score_logreg, AIC_logreg, RSS_logreg, cv_mean_logreg ,fnr_logreg, fpr_logreg]
table.loc[4]=['Gradient Tree Boosting',AUC_gbm, model_score_gbm, AIC_gbm,RSS_gbm, cv_mean_gbm , fnr_gbm, fpr_gbm]
table.loc[5]=['Naive Bayes ', AUC_bays, model_score_bays, AIC_bays,RSS_bays, cv_mean_bays , fnr_bays, fpr_bays]
table.loc[7]=['Decision trees', AUC_DT, model_score_DT, AIC_DT,RSS_DT, cv_mean_DT , fnr_DT, fpr_DT]
table.loc[6]=['Adaptive Boosting', AUC_ada, model_score_ada, AIC_ada, RSS_ada , cv_mean_ada , fnr_ada, fpr_ada]
table.loc[8]=['Extra Tree',AUC_extra, model_score_extra, AIC_extra, RSS_extra , cv_mean_extra , fnr_extra, fpr_extra]
#table.loc[9]=['svm', AUC_svm, model_score_svm, AUC_svm, AIC_svm, RSS_svm , 0 , fnr_svm, fpr_svm]
table               # no-op outside a notebook; kept for interactive display
AIC_random_forest   # no-op outside a notebook; kept for interactive display
# method I: plt — overlay the ROC curves of all models on one figure.
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,4), dpi=1600)
plt.title('Courbe du ROC')
# FIX: the original passed both the 'b' format shorthand AND an explicit
# color= kwarg on every curve; the kwarg silently overrides the format, so
# the dead 'b' is dropped to avoid confusion.
plt.plot(fp_extra, tp_extra, label = 'Extra Tree = %0.3f' % AUC_extra,color='c')
plt.plot(fp_random_forest, tp_random_forest, label = 'Random forest = %0.3f' % AUC_random_forest,color='silver')
plt.plot(fp_bagging, tp_bagging, label = 'Bagging Tree = %0.3f' % AUC_bagging,color='blue')
plt.plot(fp_mlp, tp_mlp, label = 'mlp = %0.3f' % AUC_mlp,color='yellow')
plt.plot(fp_logreg, tp_logreg, label = 'logistic Reg = %0.3f' % AUC_logreg ,color='green')
plt.plot(fp_gbm, tp_gbm, label = 'Gradient B = %0.3f' % AUC_gbm, color='deeppink')
plt.plot(fp_bays, tp_bays, label = ' Naive Bays = %0.3f' % AUC_bays, color='pink')
plt.plot(fp_DT, tp_DT, label = 'Decision Tree = %0.3f' % AUC_DT, color='gray')
plt.plot(fp_ada, tp_ada, label = 'AdaBoost = %0.3f' % AUC_ada, color='orange')
#plt.plot(fpr_svm, tpr_svm, label = 'AUC svm = %0.3f' % AUC_svm, color='lightcoral')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal for reference
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# AUC Score	AIC	somme erreur carré	CV AUC mean	Error I
# Grouped bar chart: four metrics side by side for every model in `table`.
fig = plt.figure(figsize=(15,6), dpi=1000)
# libraries
import numpy as np
import matplotlib.pyplot as plt
# set width of bar
barWidth = 0.15
# set height of bar: one value per model (table row) for each metric
bars1 = table['CV AUC mean']
bars2 = table['Accuracy']
bars3 = table['Error I']
bars4 = table['Error II']
# Set position of bar on X axis: one group per model, four bars per group
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
# Make the plot
plt.bar(r1, bars1, color='#2BDFBB', width=barWidth, edgecolor='white', label='CV AUC mean')
plt.bar(r2, bars2, color='#ffff66', width=barWidth, edgecolor='white', label='Score')
plt.bar(r3, bars3, color='#FF1493', width=barWidth, edgecolor='white', label='Error I')
plt.bar(r4, bars4, color='#FFCBA4', width=barWidth, edgecolor='white', label='Error II')
# Add xticks on the middle of the group bars.
plt.xlabel('Compare models', fontweight='bold')
# FIX: the original hard-coded 9 tick labels (in an order that did not match
# the table rows), while the number of tick positions came from the table;
# modern matplotlib raises on the count mismatch. Deriving the labels from
# the table itself keeps counts and order correct by construction.
plt.xticks([r + barWidth for r in range(len(bars1))], list(table['Model']), rotation=60)
# Annotate every bar with its rounded value.
# FIX: loop bound was a hard-coded range(0, 8); iterate over the actual row
# count, and use .iloc for explicitly positional Series access.
for i in range(len(bars1)):
    plt.text(i, .7, round(bars1.iloc[i], 2), ma='center', ha='center', rotation=90, va='top', size='large')
    plt.text(0.2 + i, .7, round(bars2.iloc[i], 2), ma='center', ha='center', rotation=90, va='top', size='large')
    plt.text(0.3 + i, .3, round(bars3.iloc[i], 2), ma='center', ha='center', rotation=90, va='top')
    plt.text(0.5 + i, .3, round(bars4.iloc[i], 2), ma='center', ha='center', rotation=90, va='top')
# Create legend & Show graphic
plt.legend(loc= 'best')
plt.show()
bars2   # no-op outside a notebook; kept for interactive display